Cluster Analysis

Possible cluster interpretations:

  • 3 clusters
    • Competency/mastery
    • Equity
    • College and career readiness
  • 4 clusters
    • Social justice and equity
    • College and career prep
    • Interdisciplinary thinking (-not language)
    • Blended and flexible learning
  • 5 clusters
    • Blended and flexible learning
    • Social justice and equity
    • Leadership/student coleaders
    • College and career prep
    • Not language and some interdisciplinary things (possibly based on some things least frequently chosen)
  • 6 clusters
    • Social justice and equity
    • Leadership/student coleaders
    • Competency/mastery
    • College and career prep
    • Blended and flexible learning
    • Interdisciplinary learning

Retaining either 3 or 6 clusters seems to have be most clear distinctions.

FA process is below.

Correlations

First we calculate a correlation matrix for the tags.

practices_cor <- tags %>%
  select(starts_with("practices")) %>%
  cor

practices_jac <- tags %>%
  select(starts_with("practices")) %>%
  simil(method = "Jaccard", by_rows = FALSE)

plot_tag_cor <- function(cor_mat, title = "") {
  ggcorrplot(cor_mat, hc.order = T, type = "upper") +
    scale_fill_distiller(type = "div", limits = c(-1, 1), expand = c(0, 0)) +
    labs(title = title,
         fill = "Correlation") +
    scale_x_discrete(labels = label_tags()) +
    scale_y_discrete(labels = label_tags()) +
    labs(x = "", y = "") + 
#    scale_fill_cc_gradient + 
    theme_transcend_sparse + 
    theme(axis.text.x = element_blank(), 
          axis.text.y = element_blank(),
          panel.border = element_blank(),
          axis.ticks = element_blank()
          )
}
cor_plot <- plot_tag_cor(practices_cor, 
                         title = "Correlation heat map for all tags") +
            theme(legend.position = c(.85, .25))

ggplotly(cor_plot)
jac_plot <- plot_tag_cor(as.matrix(practices_jac), 
                         title = "Jaccard similarity for all tags") +
  theme(legend.position = c(.85, .25)) + 
  scale_fill_distiller(type = "div", limits = c(0, 1), expand = c(0, 0))

ggplotly(jac_plot)
mat_to_df <- function(m) {
  data.frame(row=rownames(m)[row(m)[upper.tri(m)]], 
           col=colnames(m)[col(m)[upper.tri(m)]], 
           corr=m[upper.tri(m)])
}

d_corr <- mat_to_df(practices_cor)
d_jacc <- mat_to_df(as.matrix(practices_jac))

jacc_corr <- d_jacc %>%
  rename(jaccard = corr) %>%
  left_join(d_corr, by = c("row", "col")) %>%
  rename(correlation = corr) %>%
  mutate(
    jacc_rank = rank(-jaccard),
    corr_rank = rank(-correlation),
    rank_diff = jacc_rank - corr_rank
  ) %>% 
  arrange(desc(jaccard))

jacc_corr %>% 
  datatable(
    caption = "Comparison of Jaccard Similarity and Correlation Coeffficients",
    colnames = c("Jaccard (0,1)" = "jaccard", "Correlation (-1,1)" = "correlation")
  ) %>%
  formatRound(digits = 2, columns = c("Jaccard (0,1)", "Correlation (-1,1)")) 
corr_hist <- ggplot(jacc_corr, aes(x = correlation)) +
  geom_histogram(binwidth = 0.03, fill = transcend_cols[1]
                 ) +
  geom_vline(aes(xintercept = mean(correlation)), 
             color = transcend_cols[3], 
             size = 1) +
  geom_text(aes(x = mean(correlation), y = Inf, 
                label = paste("Average:",round(mean(correlation), 2))), 
            hjust = -.1, check_overlap = TRUE, vjust = 1.1, 
            family = "Open Sans") + 
  bar_y_scale_count +
  scale_x_continuous(limits = c(-1, 1), expand = expansion(0, 0)) +
  labs(title = "Distribution of pairwise tag correlations", 
       y = "Count of Tag Pairs",
       x = "Correlation") +
  theme(plot.margin = margin(t = 8, r = 12, b = 8, l = 8, unit = "pt"))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
jacc_hist <- ggplot(jacc_corr, aes(x = jaccard)) +
  geom_histogram(binwidth = 0.03, fill = transcend_cols[2]
                 ) +
  geom_vline(aes(xintercept = mean(jaccard)), 
             color = transcend_cols[3], 
             size = 1) +
  geom_text(aes(x = mean(jaccard), y = Inf, 
                label = paste("Average:", round(mean(jaccard), 2))), 
            hjust = -.1, check_overlap = TRUE, vjust = 1.1, 
            family = "Open Sans") + 
  bar_y_scale_count +
  scale_x_continuous(limits = c(0, 1), expand = expansion(0, 0)) +
  labs(title = "Distribution of pairwise tag similarities", 
       y = "Count of Tag Pairs",
       x = "Jaccard Similarity") +
  theme(plot.margin = margin(t = 8, r = 12, b = 8, l = 8, unit = "pt")) 

corr_hist + jacc_hist
## Warning: Removed 2 rows containing missing values (`geom_bar()`).
## Warning: Removed 2 rows containing missing values (`geom_bar()`).

Clustering

Scree Plots

fa_cor <- fa.parallel(
  practices_cor, fm = "pa", fa = "fa", n.obs = nrow(tags),
  main = "Correlation scree plot"
)

## Parallel analysis suggests that the number of factors =  11  and the number of components =  NA
jac_mat <- as.matrix(practices_jac)
jac_mat[is.na(jac_mat)] = 1

fa_jac <- fa.parallel(jac_mat, fm = "pa", fa = "fa", 
                      n.obs = nrow(tags),
                      main = "Jaccard scree plot"
)

## Parallel analysis suggests that the number of factors =  3  and the number of components =  NA
# Parallel analysis, doesn't run but can try to figure out if interested


# n_i  <- nrow(values_3) # The number of cases in our data
# n_p <- ncol(values_3) # The number of variables in our data
# 
# set.seed(2)   # To reproduce our randomly generated results.
# 
# Eigs <- pca_3$values    # The eigenvalues
# n_components  <- length(Eigs) # number of components
# 
# paral <- parallel(subject = n_i,  # The number of cases in our data
#                   var = n_p,  # The number of variables in our data
#                   rep = 1000,
#                   quantile = .95,
#                    model  = "components")
# 
# ParallelAna <- data.frame(Ncomponent  = 1:n_components,
#                            Eigs,
#                            RandEigM = paral$eigen$mevpea,
#                            RandEig95= paral$eigen$qevpea)
# 
# ParallelAna <- round(ParallelAna, 3)
# ParallelAna
# exceeder <- ParallelAna[ParallelAna[, "RandEig95"] > ParallelAna[, "Eigs"], ][1,]
# exceeder

Cluster interpretation:

  • 3 clusters
    • Competency/mastery
    • Equity
    • College and career readiness
  • 4 clusters
    • Social justice and equity
    • College and career prep
    • Interdisciplinary thinking (-not language)
    • Blended and flexible learning
  • 5 clusters
    • Blended and flexible learning
    • Social justice and equity
    • Leadership/student coleaders
    • College and career prep
    • Not language and some interdisciplinary things (possibly based on some things least frequently chosen)
  • 6 clusters
    • Social justice and equity
    • Leadership/student coleaders
    • Competency/mastery
    • College and career prep
    • Blended and flexible learning
    • Interdisciplinary learning

Retaining 3 clusters

efa_3 %>%
  model_parameters(sort = TRUE, threshold = "max") %>%
#  write_tsv(here("output", "EFA 2023 Results Max.txt"), na = "") %>%
  datatable(caption = "Max loadings") %>%
  formatRound(digits = 2, columns = 2:6) 
efa_3 %>%
  model_parameters(sort = TRUE, threshold = 0.28) %>%
#  write_tsv(here("output", "EFA 2023 Results Threshold.txt"), na = "") %>%
  datatable(caption = "Threshold loadings") %>%
  formatRound(digits = 2, columns = 2:6) 
efa_3 %>%
  model_parameters(sort = TRUE) %>%
#  write_tsv(here("output", "EFA 2023 Results All.txt"), na = "") %>%
  datatable(caption = "All loadings") %>%
  formatRound(digits = 2, columns = 2:6) 

Retaining 4 clusters

Pearson’s R with 4

efa_4 <- fa(practices_cor, nfactors = 4, rotate = "oblimin", fm = "minres")

# print(efa_4, sort = T)

efa_4 %>%
  model_parameters(sort = TRUE, threshold = "max") %>%
  datatable(caption = "Max loadings") %>%
  formatRound(digits = 2, columns = 2:7) 
efa_4 %>%
  model_parameters(sort = TRUE, threshold = 0.28) %>%
  datatable(caption = "Threshold loadings") %>%
  formatRound(digits = 2, columns = 2:7) 
efa_4 %>%
  model_parameters(sort = TRUE) %>%
  datatable(caption = "All loadings") %>%
  formatRound(digits = 2, columns = 2:7) 

Jaccard with 4

Retaining 5 clusters

Pearson’s R with 5

efa_5 <- fa(practices_cor, nfactors = 5, rotate = "oblimin", fm = "minres")

# print(efa_5, sort = T)

efa_5 %>%
  model_parameters(sort = TRUE, threshold = "max") %>%
  datatable(caption = "Max loadings") %>%
  formatRound(digits = 2, columns = 2:8) 
efa_5 %>%
  model_parameters(sort = TRUE, threshold = 0.28) %>%
  datatable(caption = "Threshold loadings") %>%
  formatRound(digits = 2, columns = 2:8) 
efa_5 %>%
  model_parameters(sort = TRUE) %>%
  datatable(caption = "All loadings") %>%
  formatRound(digits = 2, columns = 2:8) 

Jaccard with 5

Retaining 6 clusters

Pearson’s R with 6

efa_6 <- fa(practices_cor, nfactors = 6, rotate = "oblimin", fm = "minres")

# print(efa_6, sort = T)

efa_6 %>%
  model_parameters(sort = TRUE, threshold = "max") %>%
  datatable(caption = "Max loadings") %>%
  formatRound(digits = 2, columns = 2:9) 
efa_6 %>%
  model_parameters(sort = TRUE, threshold = .28) %>%
  datatable(caption = "Threshold loadings") %>%
  formatRound(digits = 2, columns = 2:9) 
efa_6 %>%
  model_parameters(sort = TRUE) %>%
  datatable(caption = "All loadings") %>%
  formatRound(digits = 2, columns = 2:9) 

Jaccard with 6

Clustering by School Type (Pre-K, Elementary & Middle v. Secondary)

Filter schools by type.

hs_dat <- full %>% 
  select(school_id, grades_high) %>% 
  filter(grades_high == 1)

not_hs_dat <- full %>% 
  select(school_id, grades_high) %>% 
  filter(grades_high == 0)

Filter schools by type in tags.

hs_tags <- left_join(hs_dat, tags) %>% 
  select(-grades_high)

not_hs_tags <- left_join(not_hs_dat, tags) %>% 
  select(-grades_high)

Create correlation matrices.

hs_practices_cor <- hs_tags %>%
  select(starts_with("practices")) %>%
  cor
not_hs_practices_cor <- not_hs_tags %>%
  select(starts_with("practices")) %>%
  cor

Retaining 3 clusters

High School

hs_efa_3 <- fa(hs_practices_cor, nfactors = 3, rotate = "oblimin", fm = "minres")

hs_efa_3 %>%
  model_parameters(sort = TRUE, threshold = "max") %>%
  datatable(caption = "Max loadings") %>%
  formatRound(digits = 2, columns = 2:6) 
hs_efa_3 %>%
  model_parameters(sort = TRUE, threshold = 0.28) %>%
  datatable(caption = "Threshold loadings") %>%
  formatRound(digits = 2, columns = 2:6) 
hs_efa_3 %>%
  model_parameters(sort = TRUE) %>%
  datatable(caption = "All loadings") %>%
  formatRound(digits = 2, columns = 2:6) 
  • 1 - Social justice and equity
  • 2 - Leadership
  • 3 - Deep learning

Not High School

not_hs_efa_3 <- fa(not_hs_practices_cor, nfactors = 3, rotate = "oblimin", fm = "minres")

not_hs_efa_3 %>%
  model_parameters(sort = TRUE, threshold = "max") %>%
  datatable(caption = "Max loadings") %>%
  formatRound(digits = 2, columns = 2:6) 
not_hs_efa_3 %>%
  model_parameters(sort = TRUE, threshold = 0.28) %>%
  datatable(caption = "Threshold loadings") %>%
  formatRound(digits = 2, columns = 2:6) 
not_hs_efa_3 %>%
  model_parameters(sort = TRUE) %>%
  datatable(caption = "All loadings") %>%
  formatRound(digits = 2, columns = 2:6) 
  • 1 - Competency & mastery
  • 2 - Equity & social justice
  • 3 - Mental health & holistic, trauma-informed supports

Retaining 5 clusters

High School

hs_efa_5 <- fa(hs_practices_cor, nfactors = 5, rotate = "oblimin", fm = "minres")

hs_efa_5 %>%
  model_parameters(sort = TRUE, threshold = "max") %>%
  datatable(caption = "Max loadings") %>%
  formatRound(digits = 2, columns = 2:7) 
hs_efa_5 %>%
  model_parameters(sort = TRUE, threshold = 0.28) %>%
  datatable(caption = "Threshold loadings") %>%
  formatRound(digits = 2, columns = 2:6) 
hs_efa_5 %>%
  model_parameters(sort = TRUE) %>%
  datatable(caption = "All loadings") %>%
  formatRound(digits = 2, columns = 2:6) 
  • 1 - Equity & justice
  • 2 - Leadership
  • 3 - Interdisciplinary education
  • 4 - Flexible learning environments
  • 5 - Community and career development

I don’t think these clusters are as compelling/clear as just keeping 3. 6 clusters were too many, so did 5 here just to have another option.

Not High School

not_hs_efa_5 <- fa(not_hs_practices_cor, nfactors = 5, rotate = "oblimin", fm = "minres")

not_hs_efa_5 %>%
  model_parameters(sort = TRUE, threshold = "max") %>%
  datatable(caption = "Max loadings") %>%
  formatRound(digits = 2, columns = 2:6) 
not_hs_efa_5 %>%
  model_parameters(sort = TRUE, threshold = 0.28) %>%
  datatable(caption = "Threshold loadings") %>%
  formatRound(digits = 2, columns = 2:6) 
not_hs_efa_5 %>%
  model_parameters(sort = TRUE) %>%
  datatable(caption = "All loadings") %>%
  formatRound(digits = 2, columns = 2:6) 
  • 1 - Accommodating, competency-based education
  • 2 - Leadership
  • 3 - Equity & justice
  • 4 - Flexible learning environments
  • 5 - Mental health & holistic, trauma-informed supports

Analysis File

Create final cluster analysis

efa_2023 = fa(practices_cor, nfactors = 4, rotate = "oblimin", fm = "minres")

write_efa_files = function(efa, dir, file, threshold = 0.28) {
  efa %>%
    model_parameters(sort = TRUE, threshold = "max") %>%
    write_tsv(here(dir, paste(file, "Max.txt")), na = "")
  efa %>%
    model_parameters(sort = TRUE, threshold = threshold) %>%
    write_tsv(here(dir, paste(file, "Threshold.txt")), na = "")
  efa %>%
    model_parameters(sort = TRUE) %>%
    write_tsv(here(dir, paste(file, "All.txt")), na = "")
  invisible()
}

print_efa = function(
  efa, 
  type = c("max", "threshold", "all"), 
  threshold = 0.28,
  caption = "")
{
  type = match.arg(type)
  if(type == "max") {
    return(efa |> 
      model_parameters(sort = TRUE, threshold = "max") |>
      datatable(caption = paste(caption, "Max loadings")) |>
      formatRound(digits = 2, columns = 2:(efa$factors + 3)))
  }
  if(type == "threshold") {
    return(efa |> 
      model_parameters(sort = TRUE, threshold = threshold) |>
      datatable(caption = paste(caption, "Threshold loadings")) |>
      formatRound(digits = 2, columns = 2:(efa$factors + 3)))
  }
  if(type == "all") {
    return(efa |> 
      model_parameters(sort = TRUE) |>
      datatable(caption = paste(caption, "All loadings")) |>
      formatRound(digits = 2, columns = 2:(efa$factors + 3)))
  }
}

efa_2023 %>%
  model_parameters(sort = TRUE) %>%
  write_tsv(here("output", "EFA 2023 Results All.txt"), na = "") %>%
  datatable(caption = "All loadings") %>%
  formatRound(digits = 2, columns = 2:7)